Filtering criteria:
Gandal:
Keep only control samples
Keep probes with expression > 0 in at least half of the samples
BrainSpan:
Keep only Temporal and Frontal lobe samples
Keep probes with mean expression larger than 0.005
Both:
# Load Gandal's Dataset
if(file.exists('./../Data/Gandal_RNASeq.RData')){
load('./../Data/Gandal_RNASeq.RData')
datExpr_Gandal = datExpr %>% dplyr::select(which(colnames(datExpr) %in% rownames(datMeta[datMeta$Diagnosis_=='CTL',])))
datMeta_Gandal = datMeta %>% filter(Diagnosis_=='CTL')
datProbes_Gandal = datProbes
DE_info_Gandal = DE_info
} else print('Gandal_RNASeq.RData does not exist. Find how to create it in 04_26_Gandal_RNASeq_exploratory_analysis.RData')
# Load BrainSpan's Dataset
if(file.exists('./../Data/BrainSpan_filtered.RData')){
load('./../Data/BrainSpan_filtered.RData')
datExpr_BrainSpan = datExpr
datMeta_BrainSpan = datMeta
datProbes_BrainSpan = datProbes
DE_info_BrainSpan = DE_info
} else print('BrainSpan_raw.RData does not exist. Find how to create it in 05_14_BrainSpan_exploratory_analysis.RData')
rm(datExpr, datMeta, datProbes, DE_info)
# Probe comparison
paste0('After filtering, Gandal has ', nrow(datExpr_Gandal),' probes and BrainSpan has ',
nrow(datExpr_BrainSpan),', of which they share ',
sum(rownames(datExpr_Gandal) %in% rownames(datExpr_BrainSpan)))
## [1] "After filtering, Gandal has 33478 probes and BrainSpan has 38790, of which they share 28654"
all_probes = unique(c(rownames(datExpr_Gandal), rownames(datExpr_BrainSpan)))
DE_genes_df = data.frame('Gandal' = all_probes %in% rownames(datExpr_Gandal),
'BrainSpan' = all_probes %in% rownames(datExpr_BrainSpan))
rownames(DE_genes_df) = all_probes
plot(venneuler(DE_genes_df))
# Filter to keep only common probes
datExpr_Gandal = datExpr_Gandal[rownames(datExpr_Gandal) %in% rownames(datExpr_BrainSpan),]
datProbes_Gandal = datProbes_Gandal[rownames(datProbes_Gandal) %in% rownames(datExpr_Gandal),]
datExpr_BrainSpan = datExpr_BrainSpan[rownames(datExpr_BrainSpan) %in% rownames(datExpr_Gandal),]
datProbes_BrainSpan = datProbes_BrainSpan[rownames(datProbes_BrainSpan) %in% rownames(datExpr_BrainSpan),]
# write.csv(rownames(datExpr_Gandal), file='./../Data/Gandal_BrainSpan_probes.csv', row.names=FALSE)
rm(all_probes, DE_genes_df)
Normalisation method: cqn from the cqn package (following Gandal’s preprocessing pipeline)
# Conditional Quantile Normalisation (CQN)
cqn.dat = cqn(datExpr_Gandal,lengths = as.numeric(datProbes_Gandal$length),
x = as.numeric(datProbes_Gandal$percentage_gc_content),
lengthMethod = c('smooth'), sqn = FALSE)
cqn.dat = cqn.dat$y + cqn.dat$offset # Get the log2(Normalized FPKM) values
datExpr_Gandal = cqn.dat
# # VST Normalisation
# counts = as.matrix(datExpr_Gandal)
# rowRanges = GRanges(datProbes_Gandal$chromosome_name,
# IRanges(datProbes_Gandal$start_position, width=datProbes_Gandal$length),
# strand=datProbes_Gandal$strand,
# feature_id=datProbes_Gandal$ensembl_gene_id)
# se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_Gandal)
# dds = DESeqDataSet(se, design =~1)
# dds = estimateSizeFactors(dds)
# vst_output = vst(dds)
# datExpr_Gandal = assay(vst_output)
#
# rm(counts, rowRanges, se, dds, vst_output)
rm(cqn.dat)
Assigning a label to each subject, so different samples from the same subject share class, as it would happen with real ASD samples
Gandal dataset:
fake_class = data.frame('Subject_ID' = unique(datMeta_Gandal$Subject_ID),
'fake' = rbinom(length(unique(datMeta_Gandal$Subject_ID)), 1, 0.5))
datMeta_Gandal = datMeta_Gandal %>% left_join(fake_class, by='Subject_ID')
table(datMeta_Gandal$fake)
##
## 0 1
## 18 17
BrainSpan dataset:
fake_class = data.frame('donor_id' = unique(datMeta_BrainSpan$donor_id),
'fake' = rbinom(length(unique(datMeta_BrainSpan$donor_id)), 1, 0.5))
datMeta_BrainSpan = datMeta_BrainSpan %>% left_join(fake_class, by='donor_id')
table(datMeta_BrainSpan$fake)
##
## 0 1
## 112 189
Following Gandal’s preprocessing script (limma lmFit), removing robust parameter to improve running time for BrainSpan
# Gandal dataset:
mod = model.matrix(~fake, data=datMeta_Gandal)
corfit = duplicateCorrelation(datExpr_Gandal, mod, block=datMeta_Gandal$Subject_ID)
lmfit = lmFit(datExpr_Gandal, design=mod, block=datMeta_Gandal$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_Gandal))
new_DE_info_Gandal = top_genes[match(rownames(datExpr_Gandal), rownames(top_genes)),]
new_DE_info_Gandal$signif_p_val = new_DE_info_Gandal$adj.P.Val<0.05
new_DE_info_Gandal$signif_lfc = abs(new_DE_info_Gandal$logFC)>log2(1.2)
new_DE_info_Gandal$signif = new_DE_info_Gandal$signif_p_val & new_DE_info_Gandal$signif_lfc
new_DE_info_Gandal = new_DE_info_Gandal %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
dplyr::select(ID, logFC, AveExpr, P.Value, adj.P.Val, signif_p_val, signif_lfc, signif,
`gene-score`, syndromic) %>% mutate(`gene-score` = as.factor(`gene-score`))
rm(mod, corfit, lmfit, fit, top_genes)
# BrainSpan dataset:
mod = model.matrix(~fake, data=datMeta_BrainSpan)
corfit = duplicateCorrelation(log2(datExpr_BrainSpan+1), mod, block=datMeta_BrainSpan$donor_id)
lmfit = lmFit(log2(datExpr_BrainSpan+1), design=mod, block=datMeta_BrainSpan$donor_id, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_BrainSpan))
new_DE_info_BrainSpan = top_genes[match(rownames(datExpr_BrainSpan), rownames(top_genes)),]
new_DE_info_BrainSpan$signif_p_val = new_DE_info_BrainSpan$adj.P.Val<0.05
new_DE_info_BrainSpan$signif_lfc = abs(new_DE_info_BrainSpan$logFC)>log2(1.2)
new_DE_info_BrainSpan$signif = new_DE_info_BrainSpan$signif_p_val & new_DE_info_BrainSpan$signif_lfc
new_DE_info_BrainSpan = new_DE_info_BrainSpan %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
dplyr::select(ID, logFC, AveExpr, P.Value, adj.P.Val, signif_p_val, signif_lfc, signif,
`gene-score`, syndromic) %>% mutate(`gene-score` = as.factor(`gene-score`))
rm(mod, corfit, lmfit, fit, top_genes)
In the Gandal dataset the higher the SFARI score, the lower the logFC, but this pattern is not as clear in the BrainSpan dataset. This could be because the standard deviation in BrainSpan doesn’t seem to be affected as much with the log2 transformation, staying relatively constant for all scores, while it got inverted in Gandal’s, which in turn is a consequence of the relation between the mean and standard deviation of the probes.
p1 = ggplotly(new_DE_info_Gandal %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() +
scale_color_manual(values=gg_colour_hue(9)) + theme_minimal() + theme(legend.position = 'none') +
ggtitle('logFC in Gandal dataset (left) and in BrainSpan (right)'))
p2 = ggplotly(new_DE_info_BrainSpan %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() +
scale_color_manual(values=gg_colour_hue(9)) + theme_minimal() + theme(legend.position = 'none') +
ggtitle('logFC in Gandal dataset (left) and in BrainSpan (right)'))
subplot(p1, p2, nrows=1)
rm(p1, p2)
There isn’t a strong correlation between logFC in the two datasets
plot_data = new_DE_info_Gandal %>% left_join(new_DE_info_BrainSpan, by='ID') %>%
mutate('gene-score'=as.factor(ifelse(is.na(`gene-score.x`), 'NA', `gene-score.x`)))
corr = cor(plot_data$logFC.x, plot_data$logFC.y)
ggplotly(plot_data %>% ggplot(aes(logFC.x, logFC.y, color=`gene-score`)) + geom_point(alpha=0.2) +
geom_hline(yintercept=log2(1.2), color='#cc0099') + geom_hline(yintercept=-log2(1.2), color='#cc0099') +
geom_vline(xintercept=log2(1.2), color='#cc0099') + geom_vline(xintercept=-log2(1.2), color='#cc0099') +
ggtitle(paste0('logFC comparison by gene corr=', round(corr,4))) + scale_color_manual(values=gg_colour_hue(9)) +
xlab('Gandal') + ylab('BrainSpan') + theme_minimal())
rm(corr)
for(score in sort(unique(plot_data$`gene-score`))){
corr = cor(plot_data$logFC.x[plot_data$`gene-score`==score], plot_data$logFC.y[plot_data$`gene-score`==score])
print(paste0('Correlation = ', round(corr,4), ' for SFARI score ', score))
}
## [1] "Correlation = -0.4944 for SFARI score 1"
## [1] "Correlation = -0.0112 for SFARI score 2"
## [1] "Correlation = -0.131 for SFARI score 3"
## [1] "Correlation = -0.152 for SFARI score 4"
## [1] "Correlation = -0.2001 for SFARI score 5"
## [1] "Correlation = -0.1127 for SFARI score 6"
## [1] "Correlation = -0.0749 for SFARI score NA"
Concordance in significance of genes based in log Fold Change
table(plot_data$signif_lfc.x, plot_data$signif_lfc.y)/nrow(plot_data)*100
##
## FALSE TRUE
## FALSE 64.83931 12.51352
## TRUE 20.48016 2.16701
For both datasets, almost no genes have a significant adjusted p-value (makes sense because the classifier has no biological meaning)
summary(plot_data$signif_p_val.x)
## Mode FALSE
## logical 28657
summary(plot_data$signif_p_val.y)
## Mode FALSE TRUE
## logical 28656 1